#ifdef ENABLE_CUDA_TUNE_PARAM

#include "../../CutlassGemmParam.hpp"

namespace MNN {
namespace CUDA {
using GemmTensor_F16_F16_Linear_AlignCuda_Row_Column_Sm80_64x64x64 = cutlass::gemm::device::Gemm<
    cutlass::half_t,
    LayoutInputA,
    cutlass::half_t,
    LayoutInputB,
    cutlass::half_t,
    LayoutOutput,
    ElementAccumulator,
    cutlass::arch::OpClassTensorOp,
    cutlass::arch::Sm80,
    cutlass::gemm::GemmShape<64, 64, 64>,
    cutlass::gemm::GemmShape<32, 32, 64>,
    cutlass::gemm::GemmShape<16, 8, 16>,
    EpilogueCudaOp_F16_Linear,
    SwizzleThreadBlock,
    3,
    128 / cutlass::sizeof_bits<cutlass::half_t>::value, 128 / cutlass::sizeof_bits<cutlass::half_t>::value, true>;

using GemmTensor_F16_F16_Linear_AlignCuda_Row_Column_Sm80_128x64x64 = cutlass::gemm::device::Gemm<
    cutlass::half_t,
    LayoutInputA,
    cutlass::half_t,
    LayoutInputB,
    cutlass::half_t,
    LayoutOutput,
    ElementAccumulator,
    cutlass::arch::OpClassTensorOp,
    cutlass::arch::Sm80,
    cutlass::gemm::GemmShape<128, 64, 64>,
    cutlass::gemm::GemmShape<64, 32, 64>,
    cutlass::gemm::GemmShape<16, 8, 16>,
    EpilogueCudaOp_F16_Linear,
    SwizzleThreadBlock,
    2,
    128 / cutlass::sizeof_bits<cutlass::half_t>::value, 128 / cutlass::sizeof_bits<cutlass::half_t>::value, true>;

using GemmTensor_F16_F16_Linear_AlignCuda_Row_Column_Sm80_64x64x32 = cutlass::gemm::device::Gemm<
    cutlass::half_t,
    LayoutInputA,
    cutlass::half_t,
    LayoutInputB,
    cutlass::half_t,
    LayoutOutput,
    ElementAccumulator,
    cutlass::arch::OpClassTensorOp,
    cutlass::arch::Sm80,
    cutlass::gemm::GemmShape<64, 64, 32>,
    cutlass::gemm::GemmShape<32, 32, 32>,
    cutlass::gemm::GemmShape<16, 8, 16>,
    EpilogueCudaOp_F16_Linear,
    SwizzleThreadBlock,
    6,
    128 / cutlass::sizeof_bits<cutlass::half_t>::value, 128 / cutlass::sizeof_bits<cutlass::half_t>::value, true>;

using GemmTensor_F16_F16_Linear_AlignCuda_Row_Column_Sm80_128x64x32 = cutlass::gemm::device::Gemm<
    cutlass::half_t,
    LayoutInputA,
    cutlass::half_t,
    LayoutInputB,
    cutlass::half_t,
    LayoutOutput,
    ElementAccumulator,
    cutlass::arch::OpClassTensorOp,
    cutlass::arch::Sm80,
    cutlass::gemm::GemmShape<128, 64, 32>,
    cutlass::gemm::GemmShape<64, 32, 32>,
    cutlass::gemm::GemmShape<16, 8, 16>,
    EpilogueCudaOp_F16_Linear,
    SwizzleThreadBlock,
    4,
    128 / cutlass::sizeof_bits<cutlass::half_t>::value, 128 / cutlass::sizeof_bits<cutlass::half_t>::value, true>;

using GemmTensor_F16_F16_Linear_AlignCuda_Row_Column_Sm80_64x128x32 = cutlass::gemm::device::Gemm<
    cutlass::half_t,
    LayoutInputA,
    cutlass::half_t,
    LayoutInputB,
    cutlass::half_t,
    LayoutOutput,
    ElementAccumulator,
    cutlass::arch::OpClassTensorOp,
    cutlass::arch::Sm80,
    cutlass::gemm::GemmShape<64, 128, 32>,
    cutlass::gemm::GemmShape<32, 64, 32>,
    cutlass::gemm::GemmShape<16, 8, 16>,
    EpilogueCudaOp_F16_Linear,
    SwizzleThreadBlock,
    4,
    128 / cutlass::sizeof_bits<cutlass::half_t>::value, 128 / cutlass::sizeof_bits<cutlass::half_t>::value, true>;

using GemmTensor_F16_F16_Linear_AlignCuda_Row_Column_Sm80_256x64x32 = cutlass::gemm::device::Gemm<
    cutlass::half_t,
    LayoutInputA,
    cutlass::half_t,
    LayoutInputB,
    cutlass::half_t,
    LayoutOutput,
    ElementAccumulator,
    cutlass::arch::OpClassTensorOp,
    cutlass::arch::Sm80,
    cutlass::gemm::GemmShape<256, 64, 32>,
    cutlass::gemm::GemmShape<64, 64, 32>,
    cutlass::gemm::GemmShape<16, 8, 16>,
    EpilogueCudaOp_F16_Linear,
    SwizzleThreadBlock,
    3,
    128 / cutlass::sizeof_bits<cutlass::half_t>::value, 128 / cutlass::sizeof_bits<cutlass::half_t>::value, true>;

using GemmTensor_F16_F16_Linear_AlignCuda_Row_Column_Sm80_128x128x32 = cutlass::gemm::device::Gemm<
    cutlass::half_t,
    LayoutInputA,
    cutlass::half_t,
    LayoutInputB,
    cutlass::half_t,
    LayoutOutput,
    ElementAccumulator,
    cutlass::arch::OpClassTensorOp,
    cutlass::arch::Sm80,
    cutlass::gemm::GemmShape<128, 128, 32>,
    cutlass::gemm::GemmShape<64, 64, 32>,
    cutlass::gemm::GemmShape<16, 8, 16>,
    EpilogueCudaOp_F16_Linear,
    SwizzleThreadBlock,
    3,
    128 / cutlass::sizeof_bits<cutlass::half_t>::value, 128 / cutlass::sizeof_bits<cutlass::half_t>::value, true>;

using GemmTensor_F16_F16_Linear_AlignTensor_Row_Column_Sm80_64x64x64 = cutlass::gemm::device::Gemm<
    cutlass::half_t,
    LayoutInputA,
    cutlass::half_t,
    LayoutInputB,
    cutlass::half_t,
    LayoutOutput,
    ElementAccumulator,
    cutlass::arch::OpClassTensorOp,
    cutlass::arch::Sm80,
    cutlass::gemm::GemmShape<64, 64, 64>,
    cutlass::gemm::GemmShape<32, 32, 64>,
    cutlass::gemm::GemmShape<16, 8, 16>,
    EpilogueTensorOp_F16_Linear,
    SwizzleThreadBlock,
    3,
    128 / cutlass::sizeof_bits<cutlass::half_t>::value, 128 / cutlass::sizeof_bits<cutlass::half_t>::value, true>;

using GemmTensor_F16_F16_Linear_AlignTensor_Row_Column_Sm80_128x64x64 = cutlass::gemm::device::Gemm<
    cutlass::half_t,
    LayoutInputA,
    cutlass::half_t,
    LayoutInputB,
    cutlass::half_t,
    LayoutOutput,
    ElementAccumulator,
    cutlass::arch::OpClassTensorOp,
    cutlass::arch::Sm80,
    cutlass::gemm::GemmShape<128, 64, 64>,
    cutlass::gemm::GemmShape<64, 32, 64>,
    cutlass::gemm::GemmShape<16, 8, 16>,
    EpilogueTensorOp_F16_Linear,
    SwizzleThreadBlock,
    2,
    128 / cutlass::sizeof_bits<cutlass::half_t>::value, 128 / cutlass::sizeof_bits<cutlass::half_t>::value, true>;

using GemmTensor_F16_F16_Linear_AlignTensor_Row_Column_Sm80_64x64x32 = cutlass::gemm::device::Gemm<
    cutlass::half_t,
    LayoutInputA,
    cutlass::half_t,
    LayoutInputB,
    cutlass::half_t,
    LayoutOutput,
    ElementAccumulator,
    cutlass::arch::OpClassTensorOp,
    cutlass::arch::Sm80,
    cutlass::gemm::GemmShape<64, 64, 32>,
    cutlass::gemm::GemmShape<32, 32, 32>,
    cutlass::gemm::GemmShape<16, 8, 16>,
    EpilogueTensorOp_F16_Linear,
    SwizzleThreadBlock,
    6,
    128 / cutlass::sizeof_bits<cutlass::half_t>::value, 128 / cutlass::sizeof_bits<cutlass::half_t>::value, true>;

using GemmTensor_F16_F16_Linear_AlignTensor_Row_Column_Sm80_128x64x32 = cutlass::gemm::device::Gemm<
    cutlass::half_t,
    LayoutInputA,
    cutlass::half_t,
    LayoutInputB,
    cutlass::half_t,
    LayoutOutput,
    ElementAccumulator,
    cutlass::arch::OpClassTensorOp,
    cutlass::arch::Sm80,
    cutlass::gemm::GemmShape<128, 64, 32>,
    cutlass::gemm::GemmShape<64, 32, 32>,
    cutlass::gemm::GemmShape<16, 8, 16>,
    EpilogueTensorOp_F16_Linear,
    SwizzleThreadBlock,
    4,
    128 / cutlass::sizeof_bits<cutlass::half_t>::value, 128 / cutlass::sizeof_bits<cutlass::half_t>::value, true>;

using GemmTensor_F16_F16_Linear_AlignTensor_Row_Column_Sm80_64x128x32 = cutlass::gemm::device::Gemm<
    cutlass::half_t,
    LayoutInputA,
    cutlass::half_t,
    LayoutInputB,
    cutlass::half_t,
    LayoutOutput,
    ElementAccumulator,
    cutlass::arch::OpClassTensorOp,
    cutlass::arch::Sm80,
    cutlass::gemm::GemmShape<64, 128, 32>,
    cutlass::gemm::GemmShape<32, 64, 32>,
    cutlass::gemm::GemmShape<16, 8, 16>,
    EpilogueTensorOp_F16_Linear,
    SwizzleThreadBlock,
    4,
    128 / cutlass::sizeof_bits<cutlass::half_t>::value, 128 / cutlass::sizeof_bits<cutlass::half_t>::value, true>;

using GemmTensor_F16_F16_Linear_AlignTensor_Row_Column_Sm80_256x64x32 = cutlass::gemm::device::Gemm<
    cutlass::half_t,
    LayoutInputA,
    cutlass::half_t,
    LayoutInputB,
    cutlass::half_t,
    LayoutOutput,
    ElementAccumulator,
    cutlass::arch::OpClassTensorOp,
    cutlass::arch::Sm80,
    cutlass::gemm::GemmShape<256, 64, 32>,
    cutlass::gemm::GemmShape<64, 64, 32>,
    cutlass::gemm::GemmShape<16, 8, 16>,
    EpilogueTensorOp_F16_Linear,
    SwizzleThreadBlock,
    3,
    128 / cutlass::sizeof_bits<cutlass::half_t>::value, 128 / cutlass::sizeof_bits<cutlass::half_t>::value, true>;

using GemmTensor_F16_F16_Linear_AlignTensor_Row_Column_Sm80_128x128x32 = cutlass::gemm::device::Gemm<
    cutlass::half_t,
    LayoutInputA,
    cutlass::half_t,
    LayoutInputB,
    cutlass::half_t,
    LayoutOutput,
    ElementAccumulator,
    cutlass::arch::OpClassTensorOp,
    cutlass::arch::Sm80,
    cutlass::gemm::GemmShape<128, 128, 32>,
    cutlass::gemm::GemmShape<64, 64, 32>,
    cutlass::gemm::GemmShape<16, 8, 16>,
    EpilogueTensorOp_F16_Linear,
    SwizzleThreadBlock,
    3,
    128 / cutlass::sizeof_bits<cutlass::half_t>::value, 128 / cutlass::sizeof_bits<cutlass::half_t>::value, true>;

using GemmTensor_F16_F32_Linear_AlignCuda_Row_Column_Sm80_64x64x64 = cutlass::gemm::device::Gemm<
    cutlass::half_t,
    LayoutInputA,
    cutlass::half_t,
    LayoutInputB,
    float,
    LayoutOutput,
    ElementAccumulator,
    cutlass::arch::OpClassTensorOp,
    cutlass::arch::Sm80,
    cutlass::gemm::GemmShape<64, 64, 64>,
    cutlass::gemm::GemmShape<32, 32, 64>,
    cutlass::gemm::GemmShape<16, 8, 16>,
    EpilogueCudaOp_F32_Linear,
    SwizzleThreadBlock,
    3,
    128 / cutlass::sizeof_bits<cutlass::half_t>::value, 128 / cutlass::sizeof_bits<cutlass::half_t>::value, true>;

using GemmTensor_F16_F32_Linear_AlignCuda_Row_Column_Sm80_128x64x64 = cutlass::gemm::device::Gemm<
    cutlass::half_t,
    LayoutInputA,
    cutlass::half_t,
    LayoutInputB,
    float,
    LayoutOutput,
    ElementAccumulator,
    cutlass::arch::OpClassTensorOp,
    cutlass::arch::Sm80,
    cutlass::gemm::GemmShape<128, 64, 64>,
    cutlass::gemm::GemmShape<64, 32, 64>,
    cutlass::gemm::GemmShape<16, 8, 16>,
    EpilogueCudaOp_F32_Linear,
    SwizzleThreadBlock,
    2,
    128 / cutlass::sizeof_bits<cutlass::half_t>::value, 128 / cutlass::sizeof_bits<cutlass::half_t>::value, true>;

using GemmTensor_F16_F32_Linear_AlignCuda_Row_Column_Sm80_64x64x32 = cutlass::gemm::device::Gemm<
    cutlass::half_t,
    LayoutInputA,
    cutlass::half_t,
    LayoutInputB,
    float,
    LayoutOutput,
    ElementAccumulator,
    cutlass::arch::OpClassTensorOp,
    cutlass::arch::Sm80,
    cutlass::gemm::GemmShape<64, 64, 32>,
    cutlass::gemm::GemmShape<32, 32, 32>,
    cutlass::gemm::GemmShape<16, 8, 16>,
    EpilogueCudaOp_F32_Linear,
    SwizzleThreadBlock,
    6,
    128 / cutlass::sizeof_bits<cutlass::half_t>::value, 128 / cutlass::sizeof_bits<cutlass::half_t>::value, true>;

using GemmTensor_F16_F32_Linear_AlignCuda_Row_Column_Sm80_128x64x32 = cutlass::gemm::device::Gemm<
    cutlass::half_t,
    LayoutInputA,
    cutlass::half_t,
    LayoutInputB,
    float,
    LayoutOutput,
    ElementAccumulator,
    cutlass::arch::OpClassTensorOp,
    cutlass::arch::Sm80,
    cutlass::gemm::GemmShape<128, 64, 32>,
    cutlass::gemm::GemmShape<64, 32, 32>,
    cutlass::gemm::GemmShape<16, 8, 16>,
    EpilogueCudaOp_F32_Linear,
    SwizzleThreadBlock,
    4,
    128 / cutlass::sizeof_bits<cutlass::half_t>::value, 128 / cutlass::sizeof_bits<cutlass::half_t>::value, true>;

using GemmTensor_F16_F32_Linear_AlignCuda_Row_Column_Sm80_64x128x32 = cutlass::gemm::device::Gemm<
    cutlass::half_t,
    LayoutInputA,
    cutlass::half_t,
    LayoutInputB,
    float,
    LayoutOutput,
    ElementAccumulator,
    cutlass::arch::OpClassTensorOp,
    cutlass::arch::Sm80,
    cutlass::gemm::GemmShape<64, 128, 32>,
    cutlass::gemm::GemmShape<32, 64, 32>,
    cutlass::gemm::GemmShape<16, 8, 16>,
    EpilogueCudaOp_F32_Linear,
    SwizzleThreadBlock,
    4,
    128 / cutlass::sizeof_bits<cutlass::half_t>::value, 128 / cutlass::sizeof_bits<cutlass::half_t>::value, true>;

using GemmTensor_F16_F32_Linear_AlignCuda_Row_Column_Sm80_256x64x32 = cutlass::gemm::device::Gemm<
    cutlass::half_t,
    LayoutInputA,
    cutlass::half_t,
    LayoutInputB,
    float,
    LayoutOutput,
    ElementAccumulator,
    cutlass::arch::OpClassTensorOp,
    cutlass::arch::Sm80,
    cutlass::gemm::GemmShape<256, 64, 32>,
    cutlass::gemm::GemmShape<64, 64, 32>,
    cutlass::gemm::GemmShape<16, 8, 16>,
    EpilogueCudaOp_F32_Linear,
    SwizzleThreadBlock,
    3,
    128 / cutlass::sizeof_bits<cutlass::half_t>::value, 128 / cutlass::sizeof_bits<cutlass::half_t>::value, true>;

using GemmTensor_F16_F32_Linear_AlignCuda_Row_Column_Sm80_128x128x32 = cutlass::gemm::device::Gemm<
    cutlass::half_t,
    LayoutInputA,
    cutlass::half_t,
    LayoutInputB,
    float,
    LayoutOutput,
    ElementAccumulator,
    cutlass::arch::OpClassTensorOp,
    cutlass::arch::Sm80,
    cutlass::gemm::GemmShape<128, 128, 32>,
    cutlass::gemm::GemmShape<64, 64, 32>,
    cutlass::gemm::GemmShape<16, 8, 16>,
    EpilogueCudaOp_F32_Linear,
    SwizzleThreadBlock,
    3,
    128 / cutlass::sizeof_bits<cutlass::half_t>::value, 128 / cutlass::sizeof_bits<cutlass::half_t>::value, true>;

using GemmTensor_F16_F32_Linear_AlignTensor_Row_Column_Sm80_64x64x64 = cutlass::gemm::device::Gemm<
    cutlass::half_t,
    LayoutInputA,
    cutlass::half_t,
    LayoutInputB,
    float,
    LayoutOutput,
    ElementAccumulator,
    cutlass::arch::OpClassTensorOp,
    cutlass::arch::Sm80,
    cutlass::gemm::GemmShape<64, 64, 64>,
    cutlass::gemm::GemmShape<32, 32, 64>,
    cutlass::gemm::GemmShape<16, 8, 16>,
    EpilogueTensorOp_F32_Linear,
    SwizzleThreadBlock,
    3,
    128 / cutlass::sizeof_bits<cutlass::half_t>::value, 128 / cutlass::sizeof_bits<cutlass::half_t>::value, true>;

using GemmTensor_F16_F32_Linear_AlignTensor_Row_Column_Sm80_128x64x64 = cutlass::gemm::device::Gemm<
    cutlass::half_t,
    LayoutInputA,
    cutlass::half_t,
    LayoutInputB,
    float,
    LayoutOutput,
    ElementAccumulator,
    cutlass::arch::OpClassTensorOp,
    cutlass::arch::Sm80,
    cutlass::gemm::GemmShape<128, 64, 64>,
    cutlass::gemm::GemmShape<64, 32, 64>,
    cutlass::gemm::GemmShape<16, 8, 16>,
    EpilogueTensorOp_F32_Linear,
    SwizzleThreadBlock,
    2,
    128 / cutlass::sizeof_bits<cutlass::half_t>::value, 128 / cutlass::sizeof_bits<cutlass::half_t>::value, true>;

using GemmTensor_F16_F32_Linear_AlignTensor_Row_Column_Sm80_64x64x32 = cutlass::gemm::device::Gemm<
    cutlass::half_t,
    LayoutInputA,
    cutlass::half_t,
    LayoutInputB,
    float,
    LayoutOutput,
    ElementAccumulator,
    cutlass::arch::OpClassTensorOp,
    cutlass::arch::Sm80,
    cutlass::gemm::GemmShape<64, 64, 32>,
    cutlass::gemm::GemmShape<32, 32, 32>,
    cutlass::gemm::GemmShape<16, 8, 16>,
    EpilogueTensorOp_F32_Linear,
    SwizzleThreadBlock,
    6,
    128 / cutlass::sizeof_bits<cutlass::half_t>::value, 128 / cutlass::sizeof_bits<cutlass::half_t>::value, true>;

using GemmTensor_F16_F32_Linear_AlignTensor_Row_Column_Sm80_128x64x32 = cutlass::gemm::device::Gemm<
    cutlass::half_t,
    LayoutInputA,
    cutlass::half_t,
    LayoutInputB,
    float,
    LayoutOutput,
    ElementAccumulator,
    cutlass::arch::OpClassTensorOp,
    cutlass::arch::Sm80,
    cutlass::gemm::GemmShape<128, 64, 32>,
    cutlass::gemm::GemmShape<64, 32, 32>,
    cutlass::gemm::GemmShape<16, 8, 16>,
    EpilogueTensorOp_F32_Linear,
    SwizzleThreadBlock,
    4,
    128 / cutlass::sizeof_bits<cutlass::half_t>::value, 128 / cutlass::sizeof_bits<cutlass::half_t>::value, true>;

using GemmTensor_F16_F32_Linear_AlignTensor_Row_Column_Sm80_64x128x32 = cutlass::gemm::device::Gemm<
    cutlass::half_t,
    LayoutInputA,
    cutlass::half_t,
    LayoutInputB,
    float,
    LayoutOutput,
    ElementAccumulator,
    cutlass::arch::OpClassTensorOp,
    cutlass::arch::Sm80,
    cutlass::gemm::GemmShape<64, 128, 32>,
    cutlass::gemm::GemmShape<32, 64, 32>,
    cutlass::gemm::GemmShape<16, 8, 16>,
    EpilogueTensorOp_F32_Linear,
    SwizzleThreadBlock,
    4,
    128 / cutlass::sizeof_bits<cutlass::half_t>::value, 128 / cutlass::sizeof_bits<cutlass::half_t>::value, true>;

using GemmTensor_F16_F32_Linear_AlignTensor_Row_Column_Sm80_256x64x32 = cutlass::gemm::device::Gemm<
    cutlass::half_t,
    LayoutInputA,
    cutlass::half_t,
    LayoutInputB,
    float,
    LayoutOutput,
    ElementAccumulator,
    cutlass::arch::OpClassTensorOp,
    cutlass::arch::Sm80,
    cutlass::gemm::GemmShape<256, 64, 32>,
    cutlass::gemm::GemmShape<64, 64, 32>,
    cutlass::gemm::GemmShape<16, 8, 16>,
    EpilogueTensorOp_F32_Linear,
    SwizzleThreadBlock,
    3,
    128 / cutlass::sizeof_bits<cutlass::half_t>::value, 128 / cutlass::sizeof_bits<cutlass::half_t>::value, true>;

using GemmTensor_F16_F32_Linear_AlignTensor_Row_Column_Sm80_128x128x32 = cutlass::gemm::device::Gemm<
    cutlass::half_t,
    LayoutInputA,
    cutlass::half_t,
    LayoutInputB,
    float,
    LayoutOutput,
    ElementAccumulator,
    cutlass::arch::OpClassTensorOp,
    cutlass::arch::Sm80,
    cutlass::gemm::GemmShape<128, 128, 32>,
    cutlass::gemm::GemmShape<64, 64, 32>,
    cutlass::gemm::GemmShape<16, 8, 16>,
    EpilogueTensorOp_F32_Linear,
    SwizzleThreadBlock,
    3,
    128 / cutlass::sizeof_bits<cutlass::half_t>::value, 128 / cutlass::sizeof_bits<cutlass::half_t>::value, true>;

using GemmTensor_F16_F16_Relu_AlignCuda_Row_Column_Sm80_64x64x64 = cutlass::gemm::device::Gemm<
    cutlass::half_t,
    LayoutInputA,
    cutlass::half_t,
    LayoutInputB,
    cutlass::half_t,
    LayoutOutput,
    ElementAccumulator,
    cutlass::arch::OpClassTensorOp,
    cutlass::arch::Sm80,
    cutlass::gemm::GemmShape<64, 64, 64>,
    cutlass::gemm::GemmShape<32, 32, 64>,
    cutlass::gemm::GemmShape<16, 8, 16>,
    EpilogueCudaOp_F16_Relu,
    SwizzleThreadBlock,
    3,
    128 / cutlass::sizeof_bits<cutlass::half_t>::value, 128 / cutlass::sizeof_bits<cutlass::half_t>::value, true>;

using GemmTensor_F16_F16_Relu_AlignCuda_Row_Column_Sm80_128x64x64 = cutlass::gemm::device::Gemm<
    cutlass::half_t,
    LayoutInputA,
    cutlass::half_t,
    LayoutInputB,
    cutlass::half_t,
    LayoutOutput,
    ElementAccumulator,
    cutlass::arch::OpClassTensorOp,
    cutlass::arch::Sm80,
    cutlass::gemm::GemmShape<128, 64, 64>,
    cutlass::gemm::GemmShape<64, 32, 64>,
    cutlass::gemm::GemmShape<16, 8, 16>,
    EpilogueCudaOp_F16_Relu,
    SwizzleThreadBlock,
    2,
    128 / cutlass::sizeof_bits<cutlass::half_t>::value, 128 / cutlass::sizeof_bits<cutlass::half_t>::value, true>;

using GemmTensor_F16_F16_Relu_AlignCuda_Row_Column_Sm80_64x64x32 = cutlass::gemm::device::Gemm<
    cutlass::half_t,
    LayoutInputA,
    cutlass::half_t,
    LayoutInputB,
    cutlass::half_t,
    LayoutOutput,
    ElementAccumulator,
    cutlass::arch::OpClassTensorOp,
    cutlass::arch::Sm80,
    cutlass::gemm::GemmShape<64, 64, 32>,
    cutlass::gemm::GemmShape<32, 32, 32>,
    cutlass::gemm::GemmShape<16, 8, 16>,
    EpilogueCudaOp_F16_Relu,
    SwizzleThreadBlock,
    6,
    128 / cutlass::sizeof_bits<cutlass::half_t>::value, 128 / cutlass::sizeof_bits<cutlass::half_t>::value, true>;

using GemmTensor_F16_F16_Relu_AlignCuda_Row_Column_Sm80_128x64x32 = cutlass::gemm::device::Gemm<
    cutlass::half_t,
    LayoutInputA,
    cutlass::half_t,
    LayoutInputB,
    cutlass::half_t,
    LayoutOutput,
    ElementAccumulator,
    cutlass::arch::OpClassTensorOp,
    cutlass::arch::Sm80,
    cutlass::gemm::GemmShape<128, 64, 32>,
    cutlass::gemm::GemmShape<64, 32, 32>,
    cutlass::gemm::GemmShape<16, 8, 16>,
    EpilogueCudaOp_F16_Relu,
    SwizzleThreadBlock,
    4,
    128 / cutlass::sizeof_bits<cutlass::half_t>::value, 128 / cutlass::sizeof_bits<cutlass::half_t>::value, true>;

using GemmTensor_F16_F16_Relu_AlignCuda_Row_Column_Sm80_64x128x32 = cutlass::gemm::device::Gemm<
    cutlass::half_t,
    LayoutInputA,
    cutlass::half_t,
    LayoutInputB,
    cutlass::half_t,
    LayoutOutput,
    ElementAccumulator,
    cutlass::arch::OpClassTensorOp,
    cutlass::arch::Sm80,
    cutlass::gemm::GemmShape<64, 128, 32>,
    cutlass::gemm::GemmShape<32, 64, 32>,
    cutlass::gemm::GemmShape<16, 8, 16>,
    EpilogueCudaOp_F16_Relu,
    SwizzleThreadBlock,
    4,
    128 / cutlass::sizeof_bits<cutlass::half_t>::value, 128 / cutlass::sizeof_bits<cutlass::half_t>::value, true>;

using GemmTensor_F16_F16_Relu_AlignCuda_Row_Column_Sm80_256x64x32 = cutlass::gemm::device::Gemm<
    cutlass::half_t,
    LayoutInputA,
    cutlass::half_t,
    LayoutInputB,
    cutlass::half_t,
    LayoutOutput,
    ElementAccumulator,
    cutlass::arch::OpClassTensorOp,
    cutlass::arch::Sm80,
    cutlass::gemm::GemmShape<256, 64, 32>,
    cutlass::gemm::GemmShape<64, 64, 32>,
    cutlass::gemm::GemmShape<16, 8, 16>,
    EpilogueCudaOp_F16_Relu,
    SwizzleThreadBlock,
    3,
    128 / cutlass::sizeof_bits<cutlass::half_t>::value, 128 / cutlass::sizeof_bits<cutlass::half_t>::value, true>;

using GemmTensor_F16_F16_Relu_AlignCuda_Row_Column_Sm80_128x128x32 = cutlass::gemm::device::Gemm<
    cutlass::half_t,
    LayoutInputA,
    cutlass::half_t,
    LayoutInputB,
    cutlass::half_t,
    LayoutOutput,
    ElementAccumulator,
    cutlass::arch::OpClassTensorOp,
    cutlass::arch::Sm80,
    cutlass::gemm::GemmShape<128, 128, 32>,
    cutlass::gemm::GemmShape<64, 64, 32>,
    cutlass::gemm::GemmShape<16, 8, 16>,
    EpilogueCudaOp_F16_Relu,
    SwizzleThreadBlock,
    3,
    128 / cutlass::sizeof_bits<cutlass::half_t>::value, 128 / cutlass::sizeof_bits<cutlass::half_t>::value, true>;

using GemmTensor_F16_F16_Relu_AlignTensor_Row_Column_Sm80_64x64x64 = cutlass::gemm::device::Gemm<
    cutlass::half_t,
    LayoutInputA,
    cutlass::half_t,
    LayoutInputB,
    cutlass::half_t,
    LayoutOutput,
    ElementAccumulator,
    cutlass::arch::OpClassTensorOp,
    cutlass::arch::Sm80,
    cutlass::gemm::GemmShape<64, 64, 64>,
    cutlass::gemm::GemmShape<32, 32, 64>,
    cutlass::gemm::GemmShape<16, 8, 16>,
    EpilogueTensorOp_F16_Relu,
    SwizzleThreadBlock,
    3,
    128 / cutlass::sizeof_bits<cutlass::half_t>::value, 128 / cutlass::sizeof_bits<cutlass::half_t>::value, true>;

using GemmTensor_F16_F16_Relu_AlignTensor_Row_Column_Sm80_128x64x64 = cutlass::gemm::device::Gemm<
    cutlass::half_t,
    LayoutInputA,
    cutlass::half_t,
    LayoutInputB,
    cutlass::half_t,
    LayoutOutput,
    ElementAccumulator,
    cutlass::arch::OpClassTensorOp,
    cutlass::arch::Sm80,
    cutlass::gemm::GemmShape<128, 64, 64>,
    cutlass::gemm::GemmShape<64, 32, 64>,
    cutlass::gemm::GemmShape<16, 8, 16>,
    EpilogueTensorOp_F16_Relu,
    SwizzleThreadBlock,
    2,
    128 / cutlass::sizeof_bits<cutlass::half_t>::value, 128 / cutlass::sizeof_bits<cutlass::half_t>::value, true>;

using GemmTensor_F16_F16_Relu_AlignTensor_Row_Column_Sm80_64x64x32 = cutlass::gemm::device::Gemm<
    cutlass::half_t,
    LayoutInputA,
    cutlass::half_t,
    LayoutInputB,
    cutlass::half_t,
    LayoutOutput,
    ElementAccumulator,
    cutlass::arch::OpClassTensorOp,
    cutlass::arch::Sm80,
    cutlass::gemm::GemmShape<64, 64, 32>,
    cutlass::gemm::GemmShape<32, 32, 32>,
    cutlass::gemm::GemmShape<16, 8, 16>,
    EpilogueTensorOp_F16_Relu,
    SwizzleThreadBlock,
    6,
    128 / cutlass::sizeof_bits<cutlass::half_t>::value, 128 / cutlass::sizeof_bits<cutlass::half_t>::value, true>;

using GemmTensor_F16_F16_Relu_AlignTensor_Row_Column_Sm80_128x64x32 = cutlass::gemm::device::Gemm<
    cutlass::half_t,
    LayoutInputA,
    cutlass::half_t,
    LayoutInputB,
    cutlass::half_t,
    LayoutOutput,
    ElementAccumulator,
    cutlass::arch::OpClassTensorOp,
    cutlass::arch::Sm80,
    cutlass::gemm::GemmShape<128, 64, 32>,
    cutlass::gemm::GemmShape<64, 32, 32>,
    cutlass::gemm::GemmShape<16, 8, 16>,
    EpilogueTensorOp_F16_Relu,
    SwizzleThreadBlock,
    4,
    128 / cutlass::sizeof_bits<cutlass::half_t>::value, 128 / cutlass::sizeof_bits<cutlass::half_t>::value, true>;

using GemmTensor_F16_F16_Relu_AlignTensor_Row_Column_Sm80_64x128x32 = cutlass::gemm::device::Gemm<
    cutlass::half_t,
    LayoutInputA,
    cutlass::half_t,
    LayoutInputB,
    cutlass::half_t,
    LayoutOutput,
    ElementAccumulator,
    cutlass::arch::OpClassTensorOp,
    cutlass::arch::Sm80,
    cutlass::gemm::GemmShape<64, 128, 32>,
    cutlass::gemm::GemmShape<32, 64, 32>,
    cutlass::gemm::GemmShape<16, 8, 16>,
    EpilogueTensorOp_F16_Relu,
    SwizzleThreadBlock,
    4,
    128 / cutlass::sizeof_bits<cutlass::half_t>::value, 128 / cutlass::sizeof_bits<cutlass::half_t>::value, true>;

using GemmTensor_F16_F16_Relu_AlignTensor_Row_Column_Sm80_256x64x32 = cutlass::gemm::device::Gemm<
    cutlass::half_t,
    LayoutInputA,
    cutlass::half_t,
    LayoutInputB,
    cutlass::half_t,
    LayoutOutput,
    ElementAccumulator,
    cutlass::arch::OpClassTensorOp,
    cutlass::arch::Sm80,
    cutlass::gemm::GemmShape<256, 64, 32>,
    cutlass::gemm::GemmShape<64, 64, 32>,
    cutlass::gemm::GemmShape<16, 8, 16>,
    EpilogueTensorOp_F16_Relu,
    SwizzleThreadBlock,
    3,
    128 / cutlass::sizeof_bits<cutlass::half_t>::value, 128 / cutlass::sizeof_bits<cutlass::half_t>::value, true>;

using GemmTensor_F16_F16_Relu_AlignTensor_Row_Column_Sm80_128x128x32 = cutlass::gemm::device::Gemm<
    cutlass::half_t,
    LayoutInputA,
    cutlass::half_t,
    LayoutInputB,
    cutlass::half_t,
    LayoutOutput,
    ElementAccumulator,
    cutlass::arch::OpClassTensorOp,
    cutlass::arch::Sm80,
    cutlass::gemm::GemmShape<128, 128, 32>,
    cutlass::gemm::GemmShape<64, 64, 32>,
    cutlass::gemm::GemmShape<16, 8, 16>,
    EpilogueTensorOp_F16_Relu,
    SwizzleThreadBlock,
    3,
    128 / cutlass::sizeof_bits<cutlass::half_t>::value, 128 / cutlass::sizeof_bits<cutlass::half_t>::value, true>;

using GemmTensor_F16_F32_Relu_AlignCuda_Row_Column_Sm80_64x64x64 = cutlass::gemm::device::Gemm<
    cutlass::half_t,
    LayoutInputA,
    cutlass::half_t,
    LayoutInputB,
    float,
    LayoutOutput,
    ElementAccumulator,
    cutlass::arch::OpClassTensorOp,
    cutlass::arch::Sm80,
    cutlass::gemm::GemmShape<64, 64, 64>,
    cutlass::gemm::GemmShape<32, 32, 64>,
    cutlass::gemm::GemmShape<16, 8, 16>,
    EpilogueCudaOp_F32_Relu,
    SwizzleThreadBlock,
    3,
    128 / cutlass::sizeof_bits<cutlass::half_t>::value, 128 / cutlass::sizeof_bits<cutlass::half_t>::value, true>;

using GemmTensor_F16_F32_Relu_AlignCuda_Row_Column_Sm80_128x64x64 = cutlass::gemm::device::Gemm<
    cutlass::half_t,
    LayoutInputA,
    cutlass::half_t,
    LayoutInputB,
    float,
    LayoutOutput,
    ElementAccumulator,
    cutlass::arch::OpClassTensorOp,
    cutlass::arch::Sm80,
    cutlass::gemm::GemmShape<128, 64, 64>,
    cutlass::gemm::GemmShape<64, 32, 64>,
    cutlass::gemm::GemmShape<16, 8, 16>,
    EpilogueCudaOp_F32_Relu,
    SwizzleThreadBlock,
    2,
    128 / cutlass::sizeof_bits<cutlass::half_t>::value, 128 / cutlass::sizeof_bits<cutlass::half_t>::value, true>;

using GemmTensor_F16_F32_Relu_AlignCuda_Row_Column_Sm80_64x64x32 = cutlass::gemm::device::Gemm<
    cutlass::half_t,
    LayoutInputA,
    cutlass::half_t,
    LayoutInputB,
    float,
    LayoutOutput,
    ElementAccumulator,
    cutlass::arch::OpClassTensorOp,
    cutlass::arch::Sm80,
    cutlass::gemm::GemmShape<64, 64, 32>,
    cutlass::gemm::GemmShape<32, 32, 32>,
    cutlass::gemm::GemmShape<16, 8, 16>,
    EpilogueCudaOp_F32_Relu,
    SwizzleThreadBlock,
    6,
    128 / cutlass::sizeof_bits<cutlass::half_t>::value, 128 / cutlass::sizeof_bits<cutlass::half_t>::value, true>;

using GemmTensor_F16_F32_Relu_AlignCuda_Row_Column_Sm80_128x64x32 = cutlass::gemm::device::Gemm<
    cutlass::half_t,
    LayoutInputA,
    cutlass::half_t,
    LayoutInputB,
    float,
    LayoutOutput,
    ElementAccumulator,
    cutlass::arch::OpClassTensorOp,
    cutlass::arch::Sm80,
    cutlass::gemm::GemmShape<128, 64, 32>,
    cutlass::gemm::GemmShape<64, 32, 32>,
    cutlass::gemm::GemmShape<16, 8, 16>,
    EpilogueCudaOp_F32_Relu,
    SwizzleThreadBlock,
    4,
    128 / cutlass::sizeof_bits<cutlass::half_t>::value, 128 / cutlass::sizeof_bits<cutlass::half_t>::value, true>;

using GemmTensor_F16_F32_Relu_AlignCuda_Row_Column_Sm80_64x128x32 = cutlass::gemm::device::Gemm<
    cutlass::half_t,
    LayoutInputA,
    cutlass::half_t,
    LayoutInputB,
    float,
    LayoutOutput,
    ElementAccumulator,
    cutlass::arch::OpClassTensorOp,
    cutlass::arch::Sm80,
    cutlass::gemm::GemmShape<64, 128, 32>,
    cutlass::gemm::GemmShape<32, 64, 32>,
    cutlass::gemm::GemmShape<16, 8, 16>,
    EpilogueCudaOp_F32_Relu,
    SwizzleThreadBlock,
    4,
    128 / cutlass::sizeof_bits<cutlass::half_t>::value, 128 / cutlass::sizeof_bits<cutlass::half_t>::value, true>;

using GemmTensor_F16_F32_Relu_AlignCuda_Row_Column_Sm80_256x64x32 = cutlass::gemm::device::Gemm<
    cutlass::half_t,
    LayoutInputA,
    cutlass::half_t,
    LayoutInputB,
    float,
    LayoutOutput,
    ElementAccumulator,
    cutlass::arch::OpClassTensorOp,
    cutlass::arch::Sm80,
    cutlass::gemm::GemmShape<256, 64, 32>,
    cutlass::gemm::GemmShape<64, 64, 32>,
    cutlass::gemm::GemmShape<16, 8, 16>,
    EpilogueCudaOp_F32_Relu,
    SwizzleThreadBlock,
    3,
    128 / cutlass::sizeof_bits<cutlass::half_t>::value, 128 / cutlass::sizeof_bits<cutlass::half_t>::value, true>;

using GemmTensor_F16_F32_Relu_AlignCuda_Row_Column_Sm80_128x128x32 = cutlass::gemm::device::Gemm<
    cutlass::half_t,
    LayoutInputA,
    cutlass::half_t,
    LayoutInputB,
    float,
    LayoutOutput,
    ElementAccumulator,
    cutlass::arch::OpClassTensorOp,
    cutlass::arch::Sm80,
    cutlass::gemm::GemmShape<128, 128, 32>,
    cutlass::gemm::GemmShape<64, 64, 32>,
    cutlass::gemm::GemmShape<16, 8, 16>,
    EpilogueCudaOp_F32_Relu,
    SwizzleThreadBlock,
    3,
    128 / cutlass::sizeof_bits<cutlass::half_t>::value, 128 / cutlass::sizeof_bits<cutlass::half_t>::value, true>;

using GemmTensor_F16_F32_Relu_AlignTensor_Row_Column_Sm80_64x64x64 = cutlass::gemm::device::Gemm<
    cutlass::half_t,
    LayoutInputA,
    cutlass::half_t,
    LayoutInputB,
    float,
    LayoutOutput,
    ElementAccumulator,
    cutlass::arch::OpClassTensorOp,
    cutlass::arch::Sm80,
    cutlass::gemm::GemmShape<64, 64, 64>,
    cutlass::gemm::GemmShape<32, 32, 64>,
    cutlass::gemm::GemmShape<16, 8, 16>,
    EpilogueTensorOp_F32_Relu,
    SwizzleThreadBlock,
    3,
    128 / cutlass::sizeof_bits<cutlass::half_t>::value, 128 / cutlass::sizeof_bits<cutlass::half_t>::value, true>;

using GemmTensor_F16_F32_Relu_AlignTensor_Row_Column_Sm80_128x64x64 = cutlass::gemm::device::Gemm<
    cutlass::half_t,
    LayoutInputA,
    cutlass::half_t,
    LayoutInputB,
    float,
    LayoutOutput,
    ElementAccumulator,
    cutlass::arch::OpClassTensorOp,
    cutlass::arch::Sm80,
    cutlass::gemm::GemmShape<128, 64, 64>,
    cutlass::gemm::GemmShape<64, 32, 64>,
    cutlass::gemm::GemmShape<16, 8, 16>,
    EpilogueTensorOp_F32_Relu,
    SwizzleThreadBlock,
    2,
    128 / cutlass::sizeof_bits<cutlass::half_t>::value, 128 / cutlass::sizeof_bits<cutlass::half_t>::value, true>;

using GemmTensor_F16_F32_Relu_AlignTensor_Row_Column_Sm80_64x64x32 = cutlass::gemm::device::Gemm<
    cutlass::half_t,
    LayoutInputA,
    cutlass::half_t,
    LayoutInputB,
    float,
    LayoutOutput,
    ElementAccumulator,
    cutlass::arch::OpClassTensorOp,
    cutlass::arch::Sm80,
    cutlass::gemm::GemmShape<64, 64, 32>,
    cutlass::gemm::GemmShape<32, 32, 32>,
    cutlass::gemm::GemmShape<16, 8, 16>,
    EpilogueTensorOp_F32_Relu,
    SwizzleThreadBlock,
    6,
    128 / cutlass::sizeof_bits<cutlass::half_t>::value, 128 / cutlass::sizeof_bits<cutlass::half_t>::value, true>;

using GemmTensor_F16_F32_Relu_AlignTensor_Row_Column_Sm80_128x64x32 = cutlass::gemm::device::Gemm<
    cutlass::half_t,
    LayoutInputA,
    cutlass::half_t,
    LayoutInputB,
    float,
    LayoutOutput,
    ElementAccumulator,
    cutlass::arch::OpClassTensorOp,
    cutlass::arch::Sm80,
    cutlass::gemm::GemmShape<128, 64, 32>,
    cutlass::gemm::GemmShape<64, 32, 32>,
    cutlass::gemm::GemmShape<16, 8, 16>,
    EpilogueTensorOp_F32_Relu,
    SwizzleThreadBlock,
    4,
    128 / cutlass::sizeof_bits<cutlass::half_t>::value, 128 / cutlass::sizeof_bits<cutlass::half_t>::value, true>;

using GemmTensor_F16_F32_Relu_AlignTensor_Row_Column_Sm80_64x128x32 = cutlass::gemm::device::Gemm<
    cutlass::half_t,
    LayoutInputA,
    cutlass::half_t,
    LayoutInputB,
    float,
    LayoutOutput,
    ElementAccumulator,
    cutlass::arch::OpClassTensorOp,
    cutlass::arch::Sm80,
    cutlass::gemm::GemmShape<64, 128, 32>,
    cutlass::gemm::GemmShape<32, 64, 32>,
    cutlass::gemm::GemmShape<16, 8, 16>,
    EpilogueTensorOp_F32_Relu,
    SwizzleThreadBlock,
    4,
    128 / cutlass::sizeof_bits<cutlass::half_t>::value, 128 / cutlass::sizeof_bits<cutlass::half_t>::value, true>;

using GemmTensor_F16_F32_Relu_AlignTensor_Row_Column_Sm80_256x64x32 = cutlass::gemm::device::Gemm<
    cutlass::half_t,
    LayoutInputA,
    cutlass::half_t,
    LayoutInputB,
    float,
    LayoutOutput,
    ElementAccumulator,
    cutlass::arch::OpClassTensorOp,
    cutlass::arch::Sm80,
    cutlass::gemm::GemmShape<256, 64, 32>,
    cutlass::gemm::GemmShape<64, 64, 32>,
    cutlass::gemm::GemmShape<16, 8, 16>,
    EpilogueTensorOp_F32_Relu,
    SwizzleThreadBlock,
    3,
    128 / cutlass::sizeof_bits<cutlass::half_t>::value, 128 / cutlass::sizeof_bits<cutlass::half_t>::value, true>;

using GemmTensor_F16_F32_Relu_AlignTensor_Row_Column_Sm80_128x128x32 = cutlass::gemm::device::Gemm<
    cutlass::half_t,
    LayoutInputA,
    cutlass::half_t,
    LayoutInputB,
    float,
    LayoutOutput,
    ElementAccumulator,
    cutlass::arch::OpClassTensorOp,
    cutlass::arch::Sm80,
    cutlass::gemm::GemmShape<128, 128, 32>,
    cutlass::gemm::GemmShape<64, 64, 32>,
    cutlass::gemm::GemmShape<16, 8, 16>,
    EpilogueTensorOp_F32_Relu,
    SwizzleThreadBlock,
    3,
    128 / cutlass::sizeof_bits<cutlass::half_t>::value, 128 / cutlass::sizeof_bits<cutlass::half_t>::value, true>;

using GemmTensor_F16_F16_Relu6_AlignCuda_Row_Column_Sm80_64x64x64 = cutlass::gemm::device::Gemm<
    cutlass::half_t,
    LayoutInputA,
    cutlass::half_t,
    LayoutInputB,
    cutlass::half_t,
    LayoutOutput,
    ElementAccumulator,
    cutlass::arch::OpClassTensorOp,
    cutlass::arch::Sm80,
    cutlass::gemm::GemmShape<64, 64, 64>,
    cutlass::gemm::GemmShape<32, 32, 64>,
    cutlass::gemm::GemmShape<16, 8, 16>,
    EpilogueCudaOp_F16_Relu6,
    SwizzleThreadBlock,
    3,
    128 / cutlass::sizeof_bits<cutlass::half_t>::value, 128 / cutlass::sizeof_bits<cutlass::half_t>::value, true>;

using GemmTensor_F16_F16_Relu6_AlignCuda_Row_Column_Sm80_128x64x64 = cutlass::gemm::device::Gemm<
    cutlass::half_t,
    LayoutInputA,
    cutlass::half_t,
    LayoutInputB,
    cutlass::half_t,
    LayoutOutput,
    ElementAccumulator,
    cutlass::arch::OpClassTensorOp,
    cutlass::arch::Sm80,
    cutlass::gemm::GemmShape<128, 64, 64>,
    cutlass::gemm::GemmShape<64, 32, 64>,
    cutlass::gemm::GemmShape<16, 8, 16>,
    EpilogueCudaOp_F16_Relu6,
    SwizzleThreadBlock,
    2,
    128 / cutlass::sizeof_bits<cutlass::half_t>::value, 128 / cutlass::sizeof_bits<cutlass::half_t>::value, true>;

using GemmTensor_F16_F16_Relu6_AlignCuda_Row_Column_Sm80_64x64x32 = cutlass::gemm::device::Gemm<
    cutlass::half_t,
    LayoutInputA,
    cutlass::half_t,
    LayoutInputB,
    cutlass::half_t,
    LayoutOutput,
    ElementAccumulator,
    cutlass::arch::OpClassTensorOp,
    cutlass::arch::Sm80,
    cutlass::gemm::GemmShape<64, 64, 32>,
    cutlass::gemm::GemmShape<32, 32, 32>,
    cutlass::gemm::GemmShape<16, 8, 16>,
    EpilogueCudaOp_F16_Relu6,
    SwizzleThreadBlock,
    6,
    128 / cutlass::sizeof_bits<cutlass::half_t>::value, 128 / cutlass::sizeof_bits<cutlass::half_t>::value, true>;

using GemmTensor_F16_F16_Relu6_AlignCuda_Row_Column_Sm80_128x64x32 = cutlass::gemm::device::Gemm<
    cutlass::half_t,
    LayoutInputA,
    cutlass::half_t,
    LayoutInputB,
    cutlass::half_t,
    LayoutOutput,
    ElementAccumulator,
    cutlass::arch::OpClassTensorOp,
    cutlass::arch::Sm80,
    cutlass::gemm::GemmShape<128, 64, 32>,
    cutlass::gemm::GemmShape<64, 32, 32>,
    cutlass::gemm::GemmShape<16, 8, 16>,
    EpilogueCudaOp_F16_Relu6,
    SwizzleThreadBlock,
    4,
    128 / cutlass::sizeof_bits<cutlass::half_t>::value, 128 / cutlass::sizeof_bits<cutlass::half_t>::value, true>;

using GemmTensor_F16_F16_Relu6_AlignCuda_Row_Column_Sm80_64x128x32 = cutlass::gemm::device::Gemm<
    cutlass::half_t,
    LayoutInputA,
    cutlass::half_t,
    LayoutInputB,
    cutlass::half_t,
    LayoutOutput,
    ElementAccumulator,
    cutlass::arch::OpClassTensorOp,
    cutlass::arch::Sm80,
    cutlass::gemm::GemmShape<64, 128, 32>,
    cutlass::gemm::GemmShape<32, 64, 32>,
    cutlass::gemm::GemmShape<16, 8, 16>,
    EpilogueCudaOp_F16_Relu6,
    SwizzleThreadBlock,
    4,
    128 / cutlass::sizeof_bits<cutlass::half_t>::value, 128 / cutlass::sizeof_bits<cutlass::half_t>::value, true>;

using GemmTensor_F16_F16_Relu6_AlignCuda_Row_Column_Sm80_256x64x32 = cutlass::gemm::device::Gemm<
    cutlass::half_t,
    LayoutInputA,
    cutlass::half_t,
    LayoutInputB,
    cutlass::half_t,
    LayoutOutput,
    ElementAccumulator,
    cutlass::arch::OpClassTensorOp,
    cutlass::arch::Sm80,
    cutlass::gemm::GemmShape<256, 64, 32>,
    cutlass::gemm::GemmShape<64, 64, 32>,
    cutlass::gemm::GemmShape<16, 8, 16>,
    EpilogueCudaOp_F16_Relu6,
    SwizzleThreadBlock,
    3,
    128 / cutlass::sizeof_bits<cutlass::half_t>::value, 128 / cutlass::sizeof_bits<cutlass::half_t>::value, true>;

using GemmTensor_F16_F16_Relu6_AlignCuda_Row_Column_Sm80_128x128x32 = cutlass::gemm::device::Gemm<
    cutlass::half_t,
    LayoutInputA,
    cutlass::half_t,
    LayoutInputB,
    cutlass::half_t,
    LayoutOutput,
    ElementAccumulator,
    cutlass::arch::OpClassTensorOp,
    cutlass::arch::Sm80,
    cutlass::gemm::GemmShape<128, 128, 32>,
    cutlass::gemm::GemmShape<64, 64, 32>,
    cutlass::gemm::GemmShape<16, 8, 16>,
    EpilogueCudaOp_F16_Relu6,
    SwizzleThreadBlock,
    3,
    128 / cutlass::sizeof_bits<cutlass::half_t>::value, 128 / cutlass::sizeof_bits<cutlass::half_t>::value, true>;

using GemmTensor_F16_F16_Relu6_AlignTensor_Row_Column_Sm80_64x64x64 = cutlass::gemm::device::Gemm<
    cutlass::half_t,
    LayoutInputA,
    cutlass::half_t,
    LayoutInputB,
    cutlass::half_t,
    LayoutOutput,
    ElementAccumulator,
    cutlass::arch::OpClassTensorOp,
    cutlass::arch::Sm80,
    cutlass::gemm::GemmShape<64, 64, 64>,
    cutlass::gemm::GemmShape<32, 32, 64>,
    cutlass::gemm::GemmShape<16, 8, 16>,
    EpilogueTensorOp_F16_Relu6,
    SwizzleThreadBlock,
    3,
    128 / cutlass::sizeof_bits<cutlass::half_t>::value, 128 / cutlass::sizeof_bits<cutlass::half_t>::value, true>;

using GemmTensor_F16_F16_Relu6_AlignTensor_Row_Column_Sm80_128x64x64 = cutlass::gemm::device::Gemm<
    cutlass::half_t,
    LayoutInputA,
    cutlass::half_t,
    LayoutInputB,
    cutlass::half_t,
    LayoutOutput,
    ElementAccumulator,
    cutlass::arch::OpClassTensorOp,
    cutlass::arch::Sm80,
    cutlass::gemm::GemmShape<128, 64, 64>,
    cutlass::gemm::GemmShape<64, 32, 64>,
    cutlass::gemm::GemmShape<16, 8, 16>,
    EpilogueTensorOp_F16_Relu6,
    SwizzleThreadBlock,
    2,
    128 / cutlass::sizeof_bits<cutlass::half_t>::value, 128 / cutlass::sizeof_bits<cutlass::half_t>::value, true>;

using GemmTensor_F16_F16_Relu6_AlignTensor_Row_Column_Sm80_64x64x32 = cutlass::gemm::device::Gemm<
    cutlass::half_t,
    LayoutInputA,
    cutlass::half_t,
    LayoutInputB,
    cutlass::half_t,
    LayoutOutput,
    ElementAccumulator,
    cutlass::arch::OpClassTensorOp,
    cutlass::arch::Sm80,
    cutlass::gemm::GemmShape<64, 64, 32>,
    cutlass::gemm::GemmShape<32, 32, 32>,
    cutlass::gemm::GemmShape<16, 8, 16>,
    EpilogueTensorOp_F16_Relu6,
    SwizzleThreadBlock,
    6,
    128 / cutlass::sizeof_bits<cutlass::half_t>::value, 128 / cutlass::sizeof_bits<cutlass::half_t>::value, true>;

using GemmTensor_F16_F16_Relu6_AlignTensor_Row_Column_Sm80_128x64x32 = cutlass::gemm::device::Gemm<
    cutlass::half_t,
    LayoutInputA,
    cutlass::half_t,
    LayoutInputB,
    cutlass::half_t,
    LayoutOutput,
    ElementAccumulator,
    cutlass::arch::OpClassTensorOp,
    cutlass::arch::Sm80,
    cutlass::gemm::GemmShape<128, 64, 32>,
    cutlass::gemm::GemmShape<64, 32, 32>,
    cutlass::gemm::GemmShape<16, 8, 16>,
    EpilogueTensorOp_F16_Relu6,
    SwizzleThreadBlock,
    4,
    128 / cutlass::sizeof_bits<cutlass::half_t>::value, 128 / cutlass::sizeof_bits<cutlass::half_t>::value, true>;

using GemmTensor_F16_F16_Relu6_AlignTensor_Row_Column_Sm80_64x128x32 = cutlass::gemm::device::Gemm<
    cutlass::half_t,
    LayoutInputA,
    cutlass::half_t,
    LayoutInputB,
    cutlass::half_t,
    LayoutOutput,
    ElementAccumulator,
    cutlass::arch::OpClassTensorOp,
    cutlass::arch::Sm80,
    cutlass::gemm::GemmShape<64, 128, 32>,
    cutlass::gemm::GemmShape<32, 64, 32>,
    cutlass::gemm::GemmShape<16, 8, 16>,
    EpilogueTensorOp_F16_Relu6,
    SwizzleThreadBlock,
    4,
    128 / cutlass::sizeof_bits<cutlass::half_t>::value, 128 / cutlass::sizeof_bits<cutlass::half_t>::value, true>;

using GemmTensor_F16_F16_Relu6_AlignTensor_Row_Column_Sm80_256x64x32 = cutlass::gemm::device::Gemm<
    cutlass::half_t,
    LayoutInputA,
    cutlass::half_t,
    LayoutInputB,
    cutlass::half_t,
    LayoutOutput,
    ElementAccumulator,
    cutlass::arch::OpClassTensorOp,
    cutlass::arch::Sm80,
    cutlass::gemm::GemmShape<256, 64, 32>,
    cutlass::gemm::GemmShape<64, 64, 32>,
    cutlass::gemm::GemmShape<16, 8, 16>,
    EpilogueTensorOp_F16_Relu6,
    SwizzleThreadBlock,
    3,
    128 / cutlass::sizeof_bits<cutlass::half_t>::value, 128 / cutlass::sizeof_bits<cutlass::half_t>::value, true>;

using GemmTensor_F16_F16_Relu6_AlignTensor_Row_Column_Sm80_128x128x32 = cutlass::gemm::device::Gemm<
    cutlass::half_t,
    LayoutInputA,
    cutlass::half_t,
    LayoutInputB,
    cutlass::half_t,
    LayoutOutput,
    ElementAccumulator,
    cutlass::arch::OpClassTensorOp,
    cutlass::arch::Sm80,
    cutlass::gemm::GemmShape<128, 128, 32>,
    cutlass::gemm::GemmShape<64, 64, 32>,
    cutlass::gemm::GemmShape<16, 8, 16>,
    EpilogueTensorOp_F16_Relu6,
    SwizzleThreadBlock,
    3,
    128 / cutlass::sizeof_bits<cutlass::half_t>::value, 128 / cutlass::sizeof_bits<cutlass::half_t>::value, true>;

using GemmTensor_F16_F32_Relu6_AlignCuda_Row_Column_Sm80_64x64x64 = cutlass::gemm::device::Gemm<
    cutlass::half_t,
    LayoutInputA,
    cutlass::half_t,
    LayoutInputB,
    float,
    LayoutOutput,
    ElementAccumulator,
    cutlass::arch::OpClassTensorOp,
    cutlass::arch::Sm80,
    cutlass::gemm::GemmShape<64, 64, 64>,
    cutlass::gemm::GemmShape<32, 32, 64>,
    cutlass::gemm::GemmShape<16, 8, 16>,
    EpilogueCudaOp_F32_Relu6,
    SwizzleThreadBlock,
    3,
    128 / cutlass::sizeof_bits<cutlass::half_t>::value, 128 / cutlass::sizeof_bits<cutlass::half_t>::value, true>;

using GemmTensor_F16_F32_Relu6_AlignCuda_Row_Column_Sm80_128x64x64 = cutlass::gemm::device::Gemm<
    cutlass::half_t,
    LayoutInputA,
    cutlass::half_t,
    LayoutInputB,
    float,
    LayoutOutput,
    ElementAccumulator,
    cutlass::arch::OpClassTensorOp,
    cutlass::arch::Sm80,
    cutlass::gemm::GemmShape<128, 64, 64>,
    cutlass::gemm::GemmShape<64, 32, 64>,
    cutlass::gemm::GemmShape<16, 8, 16>,
    EpilogueCudaOp_F32_Relu6,
    SwizzleThreadBlock,
    2,
    128 / cutlass::sizeof_bits<cutlass::half_t>::value, 128 / cutlass::sizeof_bits<cutlass::half_t>::value, true>;

using GemmTensor_F16_F32_Relu6_AlignCuda_Row_Column_Sm80_64x64x32 = cutlass::gemm::device::Gemm<
    cutlass::half_t,
    LayoutInputA,
    cutlass::half_t,
    LayoutInputB,
    float,
    LayoutOutput,
    ElementAccumulator,
    cutlass::arch::OpClassTensorOp,
    cutlass::arch::Sm80,
    cutlass::gemm::GemmShape<64, 64, 32>,
    cutlass::gemm::GemmShape<32, 32, 32>,
    cutlass::gemm::GemmShape<16, 8, 16>,
    EpilogueCudaOp_F32_Relu6,
    SwizzleThreadBlock,
    6,
    128 / cutlass::sizeof_bits<cutlass::half_t>::value, 128 / cutlass::sizeof_bits<cutlass::half_t>::value, true>;

using GemmTensor_F16_F32_Relu6_AlignCuda_Row_Column_Sm80_128x64x32 = cutlass::gemm::device::Gemm<
    cutlass::half_t,
    LayoutInputA,
    cutlass::half_t,
    LayoutInputB,
    float,
    LayoutOutput,
    ElementAccumulator,
    cutlass::arch::OpClassTensorOp,
    cutlass::arch::Sm80,
    cutlass::gemm::GemmShape<128, 64, 32>,
    cutlass::gemm::GemmShape<64, 32, 32>,
    cutlass::gemm::GemmShape<16, 8, 16>,
    EpilogueCudaOp_F32_Relu6,
    SwizzleThreadBlock,
    4,
    128 / cutlass::sizeof_bits<cutlass::half_t>::value, 128 / cutlass::sizeof_bits<cutlass::half_t>::value, true>;

using GemmTensor_F16_F32_Relu6_AlignCuda_Row_Column_Sm80_64x128x32 = cutlass::gemm::device::Gemm<
    cutlass::half_t,
    LayoutInputA,
    cutlass::half_t,
    LayoutInputB,
    float,
    LayoutOutput,
    ElementAccumulator,
    cutlass::arch::OpClassTensorOp,
    cutlass::arch::Sm80,
    cutlass::gemm::GemmShape<64, 128, 32>,
    cutlass::gemm::GemmShape<32, 64, 32>,
    cutlass::gemm::GemmShape<16, 8, 16>,
    EpilogueCudaOp_F32_Relu6,
    SwizzleThreadBlock,
    4,
    128 / cutlass::sizeof_bits<cutlass::half_t>::value, 128 / cutlass::sizeof_bits<cutlass::half_t>::value, true>;

using GemmTensor_F16_F32_Relu6_AlignCuda_Row_Column_Sm80_256x64x32 = cutlass::gemm::device::Gemm<
    cutlass::half_t,
    LayoutInputA,
    cutlass::half_t,
    LayoutInputB,
    float,
    LayoutOutput,
    ElementAccumulator,
    cutlass::arch::OpClassTensorOp,
    cutlass::arch::Sm80,
    cutlass::gemm::GemmShape<256, 64, 32>,
    cutlass::gemm::GemmShape<64, 64, 32>,
    cutlass::gemm::GemmShape<16, 8, 16>,
    EpilogueCudaOp_F32_Relu6,
    SwizzleThreadBlock,
    3,
    128 / cutlass::sizeof_bits<cutlass::half_t>::value, 128 / cutlass::sizeof_bits<cutlass::half_t>::value, true>;

using GemmTensor_F16_F32_Relu6_AlignCuda_Row_Column_Sm80_128x128x32 = cutlass::gemm::device::Gemm<
    cutlass::half_t,
    LayoutInputA,
    cutlass::half_t,
    LayoutInputB,
    float,
    LayoutOutput,
    ElementAccumulator,
    cutlass::arch::OpClassTensorOp,
    cutlass::arch::Sm80,
    cutlass::gemm::GemmShape<128, 128, 32>,
    cutlass::gemm::GemmShape<64, 64, 32>,
    cutlass::gemm::GemmShape<16, 8, 16>,
    EpilogueCudaOp_F32_Relu6,
    SwizzleThreadBlock,
    3,
    128 / cutlass::sizeof_bits<cutlass::half_t>::value, 128 / cutlass::sizeof_bits<cutlass::half_t>::value, true>;

using GemmTensor_F16_F32_Relu6_AlignTensor_Row_Column_Sm80_64x64x64 = cutlass::gemm::device::Gemm<
    cutlass::half_t,
    LayoutInputA,
    cutlass::half_t,
    LayoutInputB,
    float,
    LayoutOutput,
    ElementAccumulator,
    cutlass::arch::OpClassTensorOp,
    cutlass::arch::Sm80,
    cutlass::gemm::GemmShape<64, 64, 64>,
    cutlass::gemm::GemmShape<32, 32, 64>,
    cutlass::gemm::GemmShape<16, 8, 16>,
    EpilogueTensorOp_F32_Relu6,
    SwizzleThreadBlock,
    3,
    128 / cutlass::sizeof_bits<cutlass::half_t>::value, 128 / cutlass::sizeof_bits<cutlass::half_t>::value, true>;

using GemmTensor_F16_F32_Relu6_AlignTensor_Row_Column_Sm80_128x64x64 = cutlass::gemm::device::Gemm<
    cutlass::half_t,
    LayoutInputA,
    cutlass::half_t,
    LayoutInputB,
    float,
    LayoutOutput,
    ElementAccumulator,
    cutlass::arch::OpClassTensorOp,
    cutlass::arch::Sm80,
    cutlass::gemm::GemmShape<128, 64, 64>,
    cutlass::gemm::GemmShape<64, 32, 64>,
    cutlass::gemm::GemmShape<16, 8, 16>,
    EpilogueTensorOp_F32_Relu6,
    SwizzleThreadBlock,
    2,
    128 / cutlass::sizeof_bits<cutlass::half_t>::value, 128 / cutlass::sizeof_bits<cutlass::half_t>::value, true>;

using GemmTensor_F16_F32_Relu6_AlignTensor_Row_Column_Sm80_64x64x32 = cutlass::gemm::device::Gemm<
    cutlass::half_t,
    LayoutInputA,
    cutlass::half_t,
    LayoutInputB,
    float,
    LayoutOutput,
    ElementAccumulator,
    cutlass::arch::OpClassTensorOp,
    cutlass::arch::Sm80,
    cutlass::gemm::GemmShape<64, 64, 32>,
    cutlass::gemm::GemmShape<32, 32, 32>,
    cutlass::gemm::GemmShape<16, 8, 16>,
    EpilogueTensorOp_F32_Relu6,
    SwizzleThreadBlock,
    6,
    128 / cutlass::sizeof_bits<cutlass::half_t>::value, 128 / cutlass::sizeof_bits<cutlass::half_t>::value, true>;

using GemmTensor_F16_F32_Relu6_AlignTensor_Row_Column_Sm80_128x64x32 = cutlass::gemm::device::Gemm<
    cutlass::half_t,
    LayoutInputA,
    cutlass::half_t,
    LayoutInputB,
    float,
    LayoutOutput,
    ElementAccumulator,
    cutlass::arch::OpClassTensorOp,
    cutlass::arch::Sm80,
    cutlass::gemm::GemmShape<128, 64, 32>,
    cutlass::gemm::GemmShape<64, 32, 32>,
    cutlass::gemm::GemmShape<16, 8, 16>,
    EpilogueTensorOp_F32_Relu6,
    SwizzleThreadBlock,
    4,
    128 / cutlass::sizeof_bits<cutlass::half_t>::value, 128 / cutlass::sizeof_bits<cutlass::half_t>::value, true>;

using GemmTensor_F16_F32_Relu6_AlignTensor_Row_Column_Sm80_64x128x32 = cutlass::gemm::device::Gemm<
    cutlass::half_t,
    LayoutInputA,
    cutlass::half_t,
    LayoutInputB,
    float,
    LayoutOutput,
    ElementAccumulator,
    cutlass::arch::OpClassTensorOp,
    cutlass::arch::Sm80,
    cutlass::gemm::GemmShape<64, 128, 32>,
    cutlass::gemm::GemmShape<32, 64, 32>,
    cutlass::gemm::GemmShape<16, 8, 16>,
    EpilogueTensorOp_F32_Relu6,
    SwizzleThreadBlock,
    4,
    128 / cutlass::sizeof_bits<cutlass::half_t>::value, 128 / cutlass::sizeof_bits<cutlass::half_t>::value, true>;

using GemmTensor_F16_F32_Relu6_AlignTensor_Row_Column_Sm80_256x64x32 = cutlass::gemm::device::Gemm<
    cutlass::half_t,
    LayoutInputA,
    cutlass::half_t,
    LayoutInputB,
    float,
    LayoutOutput,
    ElementAccumulator,
    cutlass::arch::OpClassTensorOp,
    cutlass::arch::Sm80,
    cutlass::gemm::GemmShape<256, 64, 32>,
    cutlass::gemm::GemmShape<64, 64, 32>,
    cutlass::gemm::GemmShape<16, 8, 16>,
    EpilogueTensorOp_F32_Relu6,
    SwizzleThreadBlock,
    3,
    128 / cutlass::sizeof_bits<cutlass::half_t>::value, 128 / cutlass::sizeof_bits<cutlass::half_t>::value, true>;

using GemmTensor_F16_F32_Relu6_AlignTensor_Row_Column_Sm80_128x128x32 = cutlass::gemm::device::Gemm<
    cutlass::half_t,
    LayoutInputA,
    cutlass::half_t,
    LayoutInputB,
    float,
    LayoutOutput,
    ElementAccumulator,
    cutlass::arch::OpClassTensorOp,
    cutlass::arch::Sm80,
    cutlass::gemm::GemmShape<128, 128, 32>,
    cutlass::gemm::GemmShape<64, 64, 32>,
    cutlass::gemm::GemmShape<16, 8, 16>,
    EpilogueTensorOp_F32_Relu6,
    SwizzleThreadBlock,
    3,
    128 / cutlass::sizeof_bits<cutlass::half_t>::value, 128 / cutlass::sizeof_bits<cutlass::half_t>::value, true>;

}
}
#endif